library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)
As COVID-19 spreads at an alarming rate, a pressing question emerges on a global scale: which characteristics of a country contribute to the spread of the coronavirus? The factors we will analyze are population density and proximity to the point of origin (China).
Reading in the Data:
Data Source 1: COVID
# Load the raw COVID table from a CSV in the working directory.
COVID <- read.csv("total-covid-cases-deaths-per-million.csv")
# Show the whole table, then its number of rows.
COVID
nrow(COVID)
[1] 9487
# List the column names that read.csv() assigned to the raw COVID table.
names(COVID)
[1] "total.covid.cases.deaths.per.million" "X"
[3] "X.1" "X.2"
[5] "X.3" "X.4"
[7] "X.5" "X.6"
[9] "X.7" "X.8"
[11] "X.9" "X.10"
[13] "X.11" "X.12"
[15] "X.13" "X.14"
[17] "X.15" "X.16"
[19] "X.17" "X.18"
[21] "X.19" "X.20"
[23] "X.21" "X.22"
[25] "X.23" "X.24"
[27] "X.25" "X.26"
[29] "X.27" "X.28"
[31] "X.29" "X.30"
[33] "X.31" "X.32"
[35] "X.33" "X.34"
[37] "X.35" "X.36"
[39] "X.37" "X.38"
[41] "X.39" "X.40"
[43] "X.41" "X.42"
[45] "X.43" "X.44"
[47] "X.45" "X.46"
[49] "X.47" "X.48"
[51] "X.49" "X.50"
[53] "X.51" "X.52"
[55] "X.53" "X.54"
[57] "X.55" "X.56"
[59] "X.57" "X.58"
[61] "X.59" "X.60"
[63] "X.61" "X.62"
[65] "X.63" "X.64"
[67] "X.65" "X.66"
[69] "X.67" "X.68"
[71] "X.69" "X.70"
[73] "X.71" "X.72"
[75] "X.73" "X.74"
[77] "X.75" "X.76"
[79] "X.77" "X.78"
[81] "X.79" "X.80"
[83] "X.81" "X.82"
[85] "X.83" "X.84"
[87] "X.85" "X.86"
[89] "X.87" "X.88"
[91] "X.89" "X.90"
[93] "X.91" "X.92"
[95] "X.93" "X.94"
[97] "X.95" "X.96"
[99] "X.97" "X.98"
[101] "X.99" "X.100"
[103] "X.101" "X.102"
[105] "X.103" "X.104"
[107] "X.105" "X.106"
[109] "X.107" "X.108"
[111] "X.109" "X.110"
[113] "X.111" "X.112"
[115] "X.113" "X.114"
[117] "X.115" "X.116"
[119] "X.117" "X.118"
[121] "X.119" "X.120"
[123] "X.121" "X.122"
[125] "X.123" "X.124"
[127] "X.125" "X.126"
[129] "X.127" "X.128"
[131] "X.129" "X.130"
[133] "X.131" "X.132"
[135] "X.133" "X.134"
[137] "X.135" "X.136"
[139] "X.137" "X.138"
[141] "X.139" "X.140"
[143] "X.141" "X.142"
[145] "X.143" "X.144"
[147] "X.145" "X.146"
[149] "X.147" "X.148"
[151] "X.149" "X.150"
[153] "X.151" "X.152"
[155] "X.153" "X.154"
[157] "X.155" "X.156"
[159] "X.157" "X.158"
[161] "X.159" "X.160"
[163] "X.161" "X.162"
[165] "X.163" "X.164"
[167] "X.165" "X.166"
[169] "X.167" "X.168"
[171] "X.169" "X.170"
[173] "X.171" "X.172"
[175] "X.173" "X.174"
[177] "X.175" "X.176"
[179] "X.177" "X.178"
[181] "X.179" "X.180"
[183] "X.181" "X.182"
[185] "X.183" "X.184"
[187] "X.185" "X.186"
[189] "X.187" "X.188"
[191] "X.189" "X.190"
[193] "X.191" "X.192"
[195] "X.193" "X.194"
[197] "X.195" "X.196"
[199] "X.197" "X.198"
[201] "X.199" "X.200"
[203] "X.201" "X.202"
[205] "X.203" "X.204"
[207] "X.205" "X.206"
[209] "X.207" "X.208"
[211] "X.209" "X.210"
[213] "X.211" "X.212"
[215] "X.213" "X.214"
[217] "X.215" "X.216"
[219] "X.217" "X.218"
[221] "X.219" "X.220"
[223] "X.221" "X.222"
[225] "X.223" "X.224"
[227] "X.225" "X.226"
[229] "X.227" "X.228"
[231] "X.229" "X.230"
[233] "X.231" "X.232"
[235] "X.233" "X.234"
[237] "X.235" "X.236"
[239] "X.237" "X.238"
[241] "X.239" "X.240"
[243] "X.241" "X.242"
[245] "X.243" "X.244"
[247] "X.245" "X.246"
[249] "X.247" "X.248"
[251] "X.249" "X.250"
[253] "X.251" "X.252"
[255] "X.253" "X.254"
# Preview the first rows of the raw COVID table.
head(COVID)
Data Source 2: CountryData
# Show the CountryData table, then its number of rows.
CountryData
nrow(CountryData)
[1] 256
# List the variables available in CountryData.
names(CountryData)
[1] "country" "area" "pop" "growth" "birth"
[6] "death" "migr" "maternal" "infant" "life"
[11] "fert" "health" "HIVrate" "HIVpeople" "HIVdeath"
[16] "obesity" "underweight" "educ" "unemploymentYouth" "GDP"
[21] "GDPgrowth" "GDPcapita" "saving" "indProd" "labor"
[26] "unemployment" "family" "tax" "budget" "debt"
[31] "inflation" "discount" "lending" "narrow" "broad"
[36] "credit" "shares" "balance" "exports" "imports"
[41] "gold" "externalDebt" "homeStock" "abroadStock" "elecProd"
[46] "elecCons" "elecExp" "elecImp" "elecCap" "elecFossil"
[51] "elecNuc" "elecHydro" "elecRenew" "oilProd" "oilExp"
[56] "oilImp" "oilRes" "petroProd" "petroCons" "petroExp"
[61] "petroImp" "gasProd" "gasCons" "gasExp" "gasImp"
[66] "gasRes" "mainlines" "cell" "netHosts" "netUsers"
[71] "airports" "railways" "roadways" "waterways" "marine"
[76] "military"
# Preview the first rows of CountryData.
head(CountryData)
Data Source 3: Continents
# Load the country/continent lookup table from a CSV in the working directory.
Continents <- read.csv("countries and continents.csv")
# Show the whole table, then its number of rows.
Continents
nrow(Continents)
[1] 251
# List the column names of the Continents lookup table.
names(Continents)
[1] "name" "official_name_en" "official_name_fr"
[4] "ISO3166.1.Alpha.2" "ISO3166.1.Alpha.3" "M49"
[7] "ITU" "MARC" "WMO"
[10] "DS" "Dial" "FIFA"
[13] "FIPS" "GAUL" "IOC"
[16] "ISO4217.currency_alphabetic_code" "ISO4217.currency_country_name" "ISO4217.currency_minor_unit"
[19] "ISO4217.currency_name" "ISO4217.currency_numeric_code" "is_independent"
[22] "Capital" "Continent" "TLD"
[25] "Languages" "Geoname.ID" "EDGAR"
# Preview the first rows of the Continents lookup table.
head(Continents)
# Show the raw COVID table again before tidying it.
COVID
Since we are solely focused on the spread of COVID-19, we filter out the death counts.
# Build a tidy table of cases per million with one row per country and date.
#
# read.csv() named the first column after the file title and the rest
# X, X.1, ...; presumably the file's real header landed in the first data row,
# which is why that row is dropped below and why every column arrives as text
# (or as a factor on R < 4.0).
TidyCOVID <- COVID %>%
  # Give the columns referenced later meaningful names.
  rename(
    country = total.covid.cases.deaths.per.million,
    Code = X,
    Date = X.1,
    CasesPerMillion = X.3
  ) %>%
  # Drop the first data row (see the note above).
  filter(row_number() > 1) %>%
  # Keep columns 1, 3 and 5 of the raw table, selected by name so the step
  # does not silently break if the column order changes. The column between
  # Date and CasesPerMillion (presumably the death counts) is discarded.
  select(country, Date, CasesPerMillion) %>%
  mutate(
    country = as.character(country),
    Date = mdy(Date),
    # Parse the recorded value itself. Going through as.character() first is
    # required because as.integer()/as.numeric() on a factor return the
    # internal level codes, not the values; as.integer() on text would also
    # truncate the decimals of a per-million rate. Blank or non-numeric cells
    # become NA.
    CasesPerMillion = as.numeric(as.character(CasesPerMillion))
  )
TidyCOVID
# Keep the first three CountryData columns (country, area, pop) and derive
# population density, rounded to two decimals.
RelevantCountryData <- CountryData[, 1:3] %>%
  mutate(popdensity = round(pop / area, 2))
# Attach the country attributes to every COVID row. No `by` is given, so the
# join uses the columns the two tables have in common.
MasterData <- TidyCOVID %>%
  left_join(RelevantCountryData)
Joining, by = "country"
# Remove the continent/world aggregate rows so that only individual countries
# remain, then convert the per-million rate into an estimated case count.
MasterData <- MasterData %>%
  filter(
    !country %in% c(
      "Africa",
      "Asia",
      "Europe",
      "North America",
      "Oceania",
      "South America",
      "World"
    )
  ) %>%
  # Scale by the exact population in millions and round only the final
  # result. Rounding pop / 1e6 first would turn every country with fewer than
  # 500,000 inhabitants into zero cases and distort the rest.
  mutate(Cases = round(CasesPerMillion * pop / 1e6))
MasterData
# For each country, find the earliest date on which Cases is non-zero.
# Rows where Cases is NA are dropped by filter() along with the zero rows.
FirstInstance <- summarise(
  group_by(filter(MasterData, Cases != 0), country),
  beginningofspread = min(Date)
)
FirstInstance
# Average number of new cases per day for each country, measured on the
# 2020-04-05 snapshot: cases on that date divided by the days elapsed since
# the country's first non-zero case count.
DailySpread <- MasterData %>%
  left_join(FirstInstance) %>%
  # Keep only the snapshot date.
  filter(Date == "2020-04-05") %>%
  mutate(
    dayselapsed = Date - beginningofspread,
    # NOTE(review): a country whose first case falls on the snapshot date has
    # zero elapsed days, so this division yields Inf -- confirm that is
    # acceptable.
    dailyspread = Cases / as.numeric(dayselapsed)
  ) %>%
  # Fastest-spreading countries first.
  arrange(desc(dailyspread))
Joining, by = "country"
# Bring the spread columns back onto the full table. The join matches on all
# shared columns (including Date), so only rows that also appear in
# DailySpread receive values; every other row gets NA in the new columns.
MasterData <- MasterData %>%
  left_join(DailySpread)
Joining, by = c("country", "Date", "CasesPerMillion", "area", "pop", "popdensity", "Cases")
# Scatter plot of daily spread against population.
ggplot(MasterData, aes(pop, dailyspread)) +
  geom_point()
# Show the combined table.
MasterData
# Total estimated cases across all countries for each date, ignoring rows
# whose case count is missing.
MasterData %>%
  group_by(Date) %>%
  summarise(totalcases = sum(Cases, na.rm = TRUE)) %>%
  ggplot(aes(Date, totalcases)) +
  geom_point()